1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

Data from the speech features

1.2 The data set

TADPOLE_D1_D2 <- read.csv("~/GitHub/BSWiMS/Data/TADPOLE/TADPOLE_D1_D2.csv")
TADPOLE_D1_D2_Dict <- read.csv("~/GitHub/BSWiMS/Data/TADPOLE/TADPOLE_D1_D2_Dict.csv")
TADPOLE_D1_D2_Dict_LR <- as.data.frame(read_excel("~/GitHub/BSWiMS/Data/TADPOLE/TADPOLE_D1_D2_Dict_LR.xlsx",sheet = "LeftRightFeatures"))


rownames(TADPOLE_D1_D2_Dict) <- TADPOLE_D1_D2_Dict$FLDNAME

1.3 Conditioning the data


# mm3 to mm
isVolume <- c("Ventricles","Hippocampus","WholeBrain","Entorhinal","Fusiform","MidTemp","ICV",
              TADPOLE_D1_D2_Dict$FLDNAME[str_detect(TADPOLE_D1_D2_Dict$TEXT,"Volume")]
              )


#TADPOLE_D1_D2[,isVolume] <- apply(TADPOLE_D1_D2[,isVolume],2,'^',(1/3))
TADPOLE_D1_D2[,isVolume] <- TADPOLE_D1_D2[,isVolume]^(1/3)

# mm2 to mm
isArea <- TADPOLE_D1_D2_Dict$FLDNAME[str_detect(TADPOLE_D1_D2_Dict$TEXT,"Area")]
TADPOLE_D1_D2[,isArea] <- sqrt(TADPOLE_D1_D2[,isArea])

# Get only cross sectional measurements
FreeSurfersetCross <- str_detect(colnames(TADPOLE_D1_D2),"UCSFFSX")

# The subset of baseline measurements
baselineTadpole <- subset(TADPOLE_D1_D2,VISCODE=="bl")
table(baselineTadpole$DX)
                   Dementia Dementia to MCI             MCI MCI to Dementia 
          7             336               1             864               5 
  MCI to NL              NL       NL to MCI 
          2             521               1 
table(baselineTadpole$DX_bl)

AD CN EMCI LMCI SMC 342 417 310 562 106


rownames(baselineTadpole) <- baselineTadpole$PTID


validBaselineTadpole <- cbind(DX=baselineTadpole$DX_bl,
                                 AGE=baselineTadpole$AGE,
                                 Gender=1*(baselineTadpole$PTGENDER=="Female"),
                                 ADAS11=baselineTadpole$ADAS11,
                                 ADAS13=baselineTadpole$ADAS13,
                                 MMSE=baselineTadpole$MMSE,
                                 RAVLT_immediate=baselineTadpole$RAVLT_immediate,
                                 RAVLT_learning=baselineTadpole$RAVLT_learning,
                                 RAVLT_forgetting=baselineTadpole$RAVLT_forgetting,
                                 RAVLT_perc_forgetting=baselineTadpole$RAVLT_perc_forgetting,
                                 FAQ=baselineTadpole$FAQ,
                                 Ventricles=baselineTadpole$Ventricles,
                                 Hippocampus=baselineTadpole$Hippocampus,
                                 WholeBrain=baselineTadpole$WholeBrain,
                                 Entorhinal=baselineTadpole$Entorhinal,
                                 Fusiform=baselineTadpole$Fusiform,
                                 MidTemp=baselineTadpole$MidTemp,
                                 ICV=baselineTadpole$ICV,
                                 baselineTadpole[,FreeSurfersetCross])


LeftFields <- TADPOLE_D1_D2_Dict_LR$LFN
names(LeftFields) <- LeftFields
LeftFields <- LeftFields[LeftFields %in% colnames(validBaselineTadpole)]
RightFields <- TADPOLE_D1_D2_Dict_LR$RFN
names(RightFields) <- RightFields
RightFields <- RightFields[RightFields %in% colnames(validBaselineTadpole)]

## Normalize to ICV
validBaselineTadpole$Ventricles=validBaselineTadpole$Ventricles/validBaselineTadpole$ICV
validBaselineTadpole$Hippocampus=validBaselineTadpole$Hippocampus/validBaselineTadpole$ICV
validBaselineTadpole$WholeBrain=validBaselineTadpole$WholeBrain/validBaselineTadpole$ICV
validBaselineTadpole$Entorhinal=validBaselineTadpole$Entorhinal/validBaselineTadpole$ICV
validBaselineTadpole$Fusiform=validBaselineTadpole$Fusiform/validBaselineTadpole$ICV
validBaselineTadpole$MidTemp=validBaselineTadpole$MidTemp/validBaselineTadpole$ICV

leftData <- validBaselineTadpole[,LeftFields]/validBaselineTadpole$ICV
RightData <- validBaselineTadpole[,RightFields]/validBaselineTadpole$ICV

## get mean and relative difference 
meanLeftRight <- (leftData + RightData)/2
difLeftRight <- abs(leftData - RightData)
reldifLeftRight <- difLeftRight/meanLeftRight
colnames(meanLeftRight) <- paste("M",colnames(meanLeftRight),sep="_")
colnames(difLeftRight) <- paste("D",colnames(difLeftRight),sep="_")
colnames(reldifLeftRight) <- paste("RD",colnames(reldifLeftRight),sep="_")


validBaselineTadpole <- validBaselineTadpole[,!(colnames(validBaselineTadpole) %in% 
                                               c(LeftFields,RightFields))]
#validBaselineTadpole <- cbind(validBaselineTadpole,meanLeftRight,difLeftRight,reldifLeftRight)
validBaselineTadpole <- cbind(validBaselineTadpole,meanLeftRight,difLeftRight)

## Remove columns with too many NA more than %15 of NA
nacount <- apply(is.na(validBaselineTadpole),2,sum)/nrow(validBaselineTadpole) < 0.15
diagnose <- validBaselineTadpole$DX
pander::pander(table(diagnose))
AD CN EMCI LMCI SMC
342 417 310 562 106
validBaselineTadpole <- validBaselineTadpole[,nacount]
## Remove character columns
ischar <- sapply(validBaselineTadpole,class) == "character"
validBaselineTadpole <- validBaselineTadpole[,!ischar]
## Place back diagnose
validBaselineTadpole$DX <- diagnose


validBaselineTadpole <- validBaselineTadpole[complete.cases(validBaselineTadpole),]
ischar <- sapply(validBaselineTadpole,class) == "character"
validBaselineTadpole[,!ischar] <- sapply(validBaselineTadpole[,!ischar],as.numeric)

colnames(validBaselineTadpole) <- str_remove_all(colnames(validBaselineTadpole),"_UCSFFSX_11_02_15_UCSFFSX51_08_01_16")
colnames(validBaselineTadpole) <- str_replace_all(colnames(validBaselineTadpole)," ","_")
validBaselineTadpole$LONISID <- NULL
validBaselineTadpole$IMAGEUID <- NULL
validBaselineTadpole$LONIUID <- NULL

diagnose <- as.character(validBaselineTadpole$DX)
validBaselineTadpole$DX <- diagnose
pander::pander(table(validBaselineTadpole$DX))
AD CN EMCI LMCI SMC
245 359 272 444 93


validBaselineTadpole[validBaselineTadpole$DX %in% c("EMCI","LMCI"),"DX"] <- "MCI" 
validBaselineTadpole[validBaselineTadpole$DX %in% c("CN","SMC"),"DX"] <- "NL" 

pander::pander(table(validBaselineTadpole$DX))
AD MCI NL
245 716 452

1.4 Get the Time To Event on MCI Subjects


subjectsID <- rownames(validBaselineTadpole)
visitsID <- unique(TADPOLE_D1_D2$VISCODE)
baseDx <- TADPOLE_D1_D2[TADPOLE_D1_D2$VISCODE=="bl",c("PTID","DX","EXAMDATE")]
rownames(baseDx) <- baseDx$PTID 
baseDx <- baseDx[subjectsID,]
lastDx <- baseDx
toDementia <- baseDx
table(lastDx$DX)
   Dementia Dementia to MCI             MCI MCI to Dementia       MCI to NL 
        244               1             711               2               2 
         NL       NL to MCI 
        452               1 
hasDementia <- lastDx$PTID[str_detect(lastDx$DX,"Dementia")]


for (vid in visitsID)
{
  DxValue <- TADPOLE_D1_D2[TADPOLE_D1_D2$VISCODE==vid,c("PTID","DX","EXAMDATE")]
  rownames(DxValue) <- DxValue$PTID 
  DxValue <- DxValue[DxValue$PTID %in% subjectsID,]
  noDX <- DxValue$PTID[nchar(DxValue$DX) < 1]
  print(length(noDX))
  DxValue[noDX,] <- lastDx[noDX,]
  inLast <- lastDx$PTID[lastDx$PTID %in% DxValue$PTID]
  print(length(inLast))
  lastDx[inLast,] <- DxValue[inLast,]
  noDementia <- !(toDementia$PTID %in% hasDementia)
  toDementia[noDementia,] <- lastDx[noDementia,]
  hasDementia <- unique(c(hasDementia,lastDx$PTID[str_detect(lastDx$DX,"Dementia")]))
}

[1] 0 [1] 1413 [1] 2 [1] 1326 [1] 6 [1] 1218 [1] 23 [1] 1095 [1] 805 [1] 1058 [1] 29 [1] 710 [1] 20 [1] 212 [1] 14 [1] 167 [1] 32 [1] 553 [1] 25 [1] 298 [1] 18 [1] 130 [1] 667 [1] 667 [1] 112 [1] 112 [1] 176 [1] 176 [1] 177 [1] 177 [1] 625 [1] 625 [1] 251 [1] 251 [1] 159 [1] 159 [1] 7 [1] 7 [1] 17 [1] 99 [1] 9 [1] 63 [1] 1 [1] 1

table(lastDx$DX)
   Dementia Dementia to MCI             MCI MCI to Dementia       MCI to NL 
        428               2             463              80               7 
         NL  NL to Dementia       NL to MCI 
        406               1              26 
baseMCI <-baseDx$PTID[baseDx$DX == "MCI"]
lastDementia <- lastDx$PTID[str_detect(lastDx$DX,"Dementia")]
lastDementia2 <- toDementia$PTID[str_detect(toDementia$DX,"Dementia")]
lastNL <- lastDx$PTID[str_detect(lastDx$DX,"NL")]

MCIatBaseline <- baseDx[baseMCI,]
MCIatEvent <- toDementia[baseMCI,]
MCIatLast <- lastDx[baseMCI,]

MCIconverters <- MCIatBaseline[baseMCI %in% lastDementia,]
MCI_No_converters <- MCIatBaseline[!(baseMCI %in% MCIconverters$PTID),]
MCIconverters$TimeToEvent <- (as.Date(toDementia[MCIconverters$PTID,"EXAMDATE"]) 
                                   - as.Date(MCIconverters$EXAMDATE))

sum(MCIconverters$TimeToEvent ==0)

[1] 0



MCIconverters$AtEventDX <- MCIatEvent[MCIconverters$PTID,"DX"]
MCIconverters$LastDX <- MCIatLast[MCIconverters$PTID,"DX"]

MCI_No_converters$TimeToEvent <- (as.Date(lastDx[MCI_No_converters$PTID,"EXAMDATE"]) 
                                   - as.Date(MCI_No_converters$EXAMDATE))

MCI_No_converters$LastDX <- MCIatLast[MCI_No_converters$PTID,"DX"]

MCI_No_converters <- subset(MCI_No_converters,TimeToEvent > 0)

2 Prognosis MCI to AD Conversion

2.1 the set


MCIPrognosisIDs <- c(MCIconverters$PTID,MCI_No_converters$PTID)

TADPOLECrossMRI <- validBaselineTadpole[MCIPrognosisIDs,]
table(TADPOLECrossMRI$DX)

MCI 680

TADPOLECrossMRI$DX <- NULL
TADPOLECrossMRI$status <- 1*(rownames(TADPOLECrossMRI) %in% MCIconverters$PTID)
table(TADPOLECrossMRI$status)

0 1 436 244

2.1.0.1 Standarize the names for the reporting

studyName <- "TADPOLE"
dataframe <- TADPOLECrossMRI
outcome <- "status"

TopVariables <- 10

2.1.1 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
680 327
pander::pander(table(dataframe[,outcome]))
0 1
436 244

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

2.1.2 Scaling the data


dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData

2.2 The heatmap of the data



hm <- heatMaps(data=dataframe,
               Outcome=outcome,
               Scale=TRUE,
               hCluster = "row",
               xlab="Feature",
               ylab="Sample",
               cexCol=0.15,
               cexRow=0.25
               )

par(op)

2.2.0.1 Correlation Matrix of the Data

The heat map of the data


par(cex=0.6,cex.main=0.85,cex.axis=0.7)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0
gplots::heatmap.2(abs(cormat),
                  trace = "none",
#                  scale = "row",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "Original Correlation",
                  cexRow = 0.15,
                  cexCol = 0.15,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")

2.3 The decorrelation

DEdataframe <- IDeA(dataframe)
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

288

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

266

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

4.81

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

4.76

2.3.1 The decorrelation matrix


par(cex=0.6,cex.main=0.85,cex.axis=0.7)

UPSTM <- attr(DEdataframe,"UPSTM")

gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "Decorrelation matrix",
                  cexRow = 0.15,
                  cexCol = 0.15,
                  key.title=NA,
                  key.xlab="|Beta|>0",
                  xlab="Output Feature", ylab="Input Feature")


par(op)

2.4 The heatmap of the decorrelated data


hm <- heatMaps(data=DEdataframe,
               Outcome=outcome,
               Scale=TRUE,
               hCluster = "row",
               cexRow = 0.15,
               cexCol = 0.15,
               xlab="Feature",
               ylab="Sample")

par(op)

2.5 The correlation matrix after decorrelation


cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0
print(max(abs(cormat)))

[1] 0.7992567


gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "Correlation after IDeA",
                  cexRow = 0.15,
                  cexCol = 0.15,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")


par(op)

2.6 U-MAP Visualization of features

2.6.1 The UMAP based on LASSO on Raw Data

classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])

2.6.2 The decorralted UMAP


datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])

2.7 Univariate Analysis

2.7.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : M_ST24SA 200 : D_ST49TA 300 : D_ST47CV




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : M_ST24SA 200 : D_ST49TA 300 : La_D_ST47CV

2.7.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
ADAS13 0.601 0.778 -0.2712 0.755 0.01704 0.788
ADAS11 0.614 0.924 -0.2621 0.825 0.00086 0.761
FAQ 0.832 1.082 -0.0134 0.724 0.00000 0.756
M_ST40CV -0.579 0.905 0.2366 0.821 0.30865 0.750
M_ST29SV -0.501 0.802 0.2761 0.859 0.31867 0.745
M_ST12SV -0.546 0.844 0.2413 0.883 0.35129 0.744
Hippocampus -0.489 0.805 0.2658 0.869 0.22638 0.737
RAVLT_immediate -0.374 0.704 0.3807 0.979 0.02525 0.728
M_ST24CV -0.568 0.959 0.2072 0.876 0.08763 0.727
M_ST31CV -0.496 0.889 0.2225 0.864 0.74725 0.717



finalTable <- univarDe$orderframe[topvar,univariate_columns]
pander::pander(univarDe$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
ADAS11 0.614 0.924 -0.2621 0.825 0.00086 0.761
FAQ 0.832 1.082 -0.0134 0.724 0.00000 0.756
M_ST40CV -0.579 0.905 0.2366 0.821 0.30865 0.750
M_ST12SV -0.546 0.844 0.2413 0.883 0.35129 0.744
Hippocampus -0.489 0.805 0.2658 0.869 0.22638 0.737
RAVLT_immediate -0.374 0.704 0.3807 0.979 0.02525 0.728
M_ST24CV -0.568 0.959 0.2072 0.876 0.08763 0.727
M_ST31CV -0.496 0.889 0.2225 0.864 0.74725 0.717
M_ST24TA -0.557 0.902 0.1400 0.837 0.06250 0.717
WholeBrain -0.471 0.802 0.1953 0.855 0.21185 0.715

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.21 33 0.101


dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")


pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
ADAS11 0.614 0.924 -0.2621 0.825 0.00086 0.761 0.761 1
FAQ 0.832 1.082 -0.0134 0.724 0.00000 0.756 0.756 NA
M_ST40CV -0.579 0.905 0.2366 0.821 0.30865 0.750 0.750 NA
M_ST12SV -0.546 0.844 0.2413 0.883 0.35129 0.744 0.744 NA
Hippocampus -0.489 0.805 0.2658 0.869 0.22638 0.737 0.737 1
RAVLT_immediate -0.374 0.704 0.3807 0.979 0.02525 0.728 0.728 NA
M_ST24CV -0.568 0.959 0.2072 0.876 0.08763 0.727 0.727 NA
M_ST31CV -0.496 0.889 0.2225 0.864 0.74725 0.717 0.717 NA
M_ST24TA -0.557 0.902 0.1400 0.837 0.06250 0.717 0.717 NA
WholeBrain -0.471 0.802 0.1953 0.855 0.21185 0.715 0.715 NA